Data Quality Report

Assessment of data quality and completeness in preparation for analysis

1. Read file in R

Three datasets were provided by Sprocket Central Pty Ltd which are: Customer Demographic, Customer Addresses, Transactions data in the past 3 months

#' Read file
transaction <- readxl::read_excel(path = file.path("temp/KPMG/KPMG_VI_New_raw_data_update_final.xlsx"),
                          sheet = 2,
                          skip =1)

customer_demographic <- readxl::read_excel(path = file.path("temp/KPMG/KPMG_VI_New_raw_data_update_final.xlsx"),
                                   sheet = 4,
                                   skip =1)

customer_address <- readxl::read_excel(path = file.path("temp/KPMG/KPMG_VI_New_raw_data_update_final.xlsx"),
                              sheet = 5,
                              skip =1)

2. Data Quality dimensions

Data quality meets seven dimensions: accuracy, completeness, consistency, currentcy, relevancy, validity, and uniqueness.

2.1 Accuracy

2.1.1. Transactions

#' Transactions
summary(transaction)
##  transaction_id    product_id      customer_id    
##  Min.   :    1   Min.   :  0.00   Min.   :   1.0  
##  1st Qu.: 5001   1st Qu.: 18.00   1st Qu.: 857.8  
##  Median :10000   Median : 44.00   Median :1736.0  
##  Mean   :10000   Mean   : 45.36   Mean   :1738.2  
##  3rd Qu.:15000   3rd Qu.: 72.00   3rd Qu.:2613.0  
##  Max.   :20000   Max.   :100.00   Max.   :5034.0  
##                                                   
##  transaction_date              online_order    order_status      
##  Min.   :2017-01-01 00:00:00   Mode :logical   Length:20000      
##  1st Qu.:2017-04-01 00:00:00   FALSE:9811      Class :character  
##  Median :2017-07-03 00:00:00   TRUE :9829      Mode  :character  
##  Mean   :2017-07-01 14:08:05   NA's :360                         
##  3rd Qu.:2017-10-02 00:00:00                                     
##  Max.   :2017-12-30 00:00:00                                     
##                                                                  
##     brand           product_line       product_class      product_size      
##  Length:20000       Length:20000       Length:20000       Length:20000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    list_price      standard_cost     product_first_sold_date
##  Min.   :  12.01   Min.   :   7.21   Min.   :33259          
##  1st Qu.: 575.27   1st Qu.: 215.14   1st Qu.:35667          
##  Median :1163.89   Median : 507.58   Median :38216          
##  Mean   :1107.83   Mean   : 556.05   Mean   :38200          
##  3rd Qu.:1635.30   3rd Qu.: 795.10   3rd Qu.:40672          
##  Max.   :2091.47   Max.   :1759.85   Max.   :42710          
##                    NA's   :197       NA's   :197

2.1.2. Customer Demographics

#' Customer Demographics
summary(customer_demographic)
##   customer_id    first_name         last_name            gender         
##  Min.   :   1   Length:4000        Length:4000        Length:4000       
##  1st Qu.:1001   Class :character   Class :character   Class :character  
##  Median :2000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2000                                                           
##  3rd Qu.:3000                                                           
##  Max.   :4000                                                           
##                                                                         
##  past_3_years_bike_related_purchases     DOB             job_title        
##  Min.   : 0.00                       Length:4000        Length:4000       
##  1st Qu.:24.00                       Class :character   Class :character  
##  Median :48.00                       Mode  :character   Mode  :character  
##  Mean   :48.89                                                            
##  3rd Qu.:73.00                                                            
##  Max.   :99.00                                                            
##                                                                           
##  job_industry_category wealth_segment     deceased_indicator   default         
##  Length:4000           Length:4000        Length:4000        Length:4000       
##  Class :character      Class :character   Class :character   Class :character  
##  Mode  :character      Mode  :character   Mode  :character   Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##    owns_car             tenure     
##  Length:4000        Min.   : 1.00  
##  Class :character   1st Qu.: 6.00  
##  Mode  :character   Median :11.00  
##                     Mean   :10.66  
##                     3rd Qu.:15.00  
##                     Max.   :22.00  
##                     NA's   :87

2.1.3. Customer Address

#' Customer Address
summary(customer_address)
##   customer_id     address             postcode       state          
##  Min.   :   1   Length:3999        Min.   :2000   Length:3999       
##  1st Qu.:1004   Class :character   1st Qu.:2200   Class :character  
##  Median :2004   Mode  :character   Median :2768   Mode  :character  
##  Mean   :2004                      Mean   :2986                     
##  3rd Qu.:3004                      3rd Qu.:3750                     
##  Max.   :4003                      Max.   :4883                     
##    country          property_valuation
##  Length:3999        Min.   : 1.000    
##  Class :character   1st Qu.: 6.000    
##  Mode  :character   Median : 8.000    
##                     Mean   : 7.514    
##                     3rd Qu.:10.000    
##                     Max.   :12.000

2.2 Completeness

2.2.1. Transactions

#' Transactions
transaction %>% dlookr::diagnose() %>%  
  tidytable::select.(-unique_count, -unique_rate) %>% 
  tidytable::filter.(missing_count > 0) %>% 
  tidytable::arrange.(desc(missing_count)) %>% 
  reactable::reactable()
## Warning in fun(libname, pkgname): couldn't connect to display ":0"

2.2.2. Customer Demographics

#' Transactions
customer_demographic %>% dlookr::diagnose() %>%  
  tidytable::select.(-unique_count, -unique_rate) %>% 
  tidytable::filter.(missing_count > 0) %>% 
  tidytable::arrange.(desc(missing_count)) %>% 
  reactable::reactable()

2.2.3. Customer Address

#' Transactions
customer_address %>% dlookr::diagnose() %>%  
  tidytable::select.(-unique_count, -unique_rate) %>% 
  tidytable::filter.(missing_count > 0) %>% 
  tidytable::arrange.(desc(missing_count)) %>% 
  reactable::reactable()

2.3 Consistency

#' Table col
  list(transaction, customer_address, customer_demographic) %>% purrr::map(~ (names(.))) %>% 
  set_names(nm = c("transaction", "customer_address", "customer_demographic"))
## $transaction
##  [1] "transaction_id"          "product_id"             
##  [3] "customer_id"             "transaction_date"       
##  [5] "online_order"            "order_status"           
##  [7] "brand"                   "product_line"           
##  [9] "product_class"           "product_size"           
## [11] "list_price"              "standard_cost"          
## [13] "product_first_sold_date"
## 
## $customer_address
## [1] "customer_id"        "address"            "postcode"          
## [4] "state"              "country"            "property_valuation"
## 
## $customer_demographic
##  [1] "customer_id"                         "first_name"                         
##  [3] "last_name"                           "gender"                             
##  [5] "past_3_years_bike_related_purchases" "DOB"                                
##  [7] "job_title"                           "job_industry_category"              
##  [9] "wealth_segment"                      "deceased_indicator"                 
## [11] "default"                             "owns_car"                           
## [13] "tenure"

Entity Relationship Diagrams

#' Plot
table_dm <- dm::dm(transaction, customer_demographic, customer_address)
table_dm %>% 
  dm::dm_add_pk(transaction, customer_id) %>% 
  dm::dm_add_pk(customer_address, customer_id) %>% 
  dm::dm_add_pk(customer_demographic, customer_id) %>% 
  dm::dm_add_fk(transaction, customer_id, customer_address) %>% 
  dm::dm_add_fk(transaction, customer_id, customer_demographic) %>% dm::dm_draw(view_type = "all")

2.4 Validity

2.4.1. Transactions

#' Transactions
transaction_validate <- transaction %>% tidytable::as_tidytable() %>%
  tidytable::mutate.(
    transaction_date = as.Date(transaction_date),
    product_first_sold_date = as.Date(product_first_sold_date, origin = "1899-12-30")) %>%
  tidytable::arrange.(-product_first_sold_date)
transaction_validate %>% reactable::reactable()

#### 2.4.2. Customer Demographics

## Warning in as.Date(as.numeric(DOB), origin = "1899-12-30"): NAs introduced by
## coercion

2.5 Consistency

2.5.1 Transactions

2.6 Uniqueness

2.6.1. Transactions

#' Transactions
transaction %>% dlookr::diagnose() %>%  
  tidytable::select.(-missing_count,-missing_percent) %>% 
  tidytable::arrange.(-unique_count) %>% reactable::reactable()

2.6.2. Customer Demographics

#' Customer Demographics
customer_demographic %>% dlookr::diagnose() %>%  
  tidytable::select.(-missing_count,-missing_percent) %>% 
  tidytable::arrange.(-unique_count) %>% reactable::reactable()

2.6.3. Customer Address

#' Customer Address
customer_address %>% dlookr::diagnose() %>%  
  tidytable::select.(-missing_count,-missing_percent) %>% 
  tidytable::arrange.(-unique_count) %>% reactable::reactable()